// Copyright 2018 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef V8_PARSING_SCANNER_INL_H_
#define V8_PARSING_SCANNER_INL_H_

#include "src/char-predicates-inl.h"
#include "src/parsing/keywords-gen.h"
#include "src/parsing/scanner.h"

namespace v8 {
namespace internal {

    // ----------------------------------------------------------------------------
    // Keyword Matcher

// X-macro listing every keyword the scanner recognizes, grouped by first
// character. KEYWORD_GROUP(ch) is invoked once per distinct initial character
// and KEYWORD(string, token) once per keyword, with the keyword's spelling and
// the token it maps to. Callers pass their own macros for both parameters to
// generate code from the list; the macro is #undef'd again further down in
// this file.
#define KEYWORDS(KEYWORD_GROUP, KEYWORD)                      \
    KEYWORD_GROUP('a')                                        \
    KEYWORD("async", Token::ASYNC)                            \
    KEYWORD("await", Token::AWAIT)                            \
    KEYWORD_GROUP('b')                                        \
    KEYWORD("break", Token::BREAK)                            \
    KEYWORD_GROUP('c')                                        \
    KEYWORD("case", Token::CASE)                              \
    KEYWORD("catch", Token::CATCH)                            \
    KEYWORD("class", Token::CLASS)                            \
    KEYWORD("const", Token::CONST)                            \
    KEYWORD("continue", Token::CONTINUE)                      \
    KEYWORD_GROUP('d')                                        \
    KEYWORD("debugger", Token::DEBUGGER)                      \
    KEYWORD("default", Token::DEFAULT)                        \
    KEYWORD("delete", Token::DELETE)                          \
    KEYWORD("do", Token::DO)                                  \
    KEYWORD_GROUP('e')                                        \
    KEYWORD("else", Token::ELSE)                              \
    KEYWORD("enum", Token::ENUM)                              \
    KEYWORD("export", Token::EXPORT)                          \
    KEYWORD("extends", Token::EXTENDS)                        \
    KEYWORD_GROUP('f')                                        \
    KEYWORD("false", Token::FALSE_LITERAL)                    \
    KEYWORD("finally", Token::FINALLY)                        \
    KEYWORD("for", Token::FOR)                                \
    KEYWORD("function", Token::FUNCTION)                      \
    KEYWORD_GROUP('g')                                        \
    KEYWORD("get", Token::GET)                                \
    KEYWORD_GROUP('i')                                        \
    KEYWORD("if", Token::IF)                                  \
    KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \
    KEYWORD("import", Token::IMPORT)                          \
    KEYWORD("in", Token::IN)                                  \
    KEYWORD("instanceof", Token::INSTANCEOF)                  \
    KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD)  \
    KEYWORD_GROUP('l')                                        \
    KEYWORD("let", Token::LET)                                \
    KEYWORD_GROUP('n')                                        \
    KEYWORD("new", Token::NEW)                                \
    KEYWORD("null", Token::NULL_LITERAL)                      \
    KEYWORD_GROUP('p')                                        \
    KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD)    \
    KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD)    \
    KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD)  \
    KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD)     \
    KEYWORD_GROUP('r')                                        \
    KEYWORD("return", Token::RETURN)                          \
    KEYWORD_GROUP('s')                                        \
    KEYWORD("set", Token::SET)                                \
    KEYWORD("static", Token::STATIC)                          \
    KEYWORD("super", Token::SUPER)                            \
    KEYWORD("switch", Token::SWITCH)                          \
    KEYWORD_GROUP('t')                                        \
    KEYWORD("this", Token::THIS)                              \
    KEYWORD("throw", Token::THROW)                            \
    KEYWORD("true", Token::TRUE_LITERAL)                      \
    KEYWORD("try", Token::TRY)                                \
    KEYWORD("typeof", Token::TYPEOF)                          \
    KEYWORD_GROUP('v')                                        \
    KEYWORD("var", Token::VAR)                                \
    KEYWORD("void", Token::VOID)                              \
    KEYWORD_GROUP('w')                                        \
    KEYWORD("while", Token::WHILE)                            \
    KEYWORD("with", Token::WITH)                              \
    KEYWORD_GROUP('y')                                        \
    KEYWORD("yield", Token::YIELD)

    constexpr bool IsKeywordStart(char c)
    {
#define KEYWORD_GROUP_CHECK(ch) c == ch ||
#define KEYWORD_CHECK(keyword, token)
        return KEYWORDS(KEYWORD_GROUP_CHECK, KEYWORD_CHECK) /* || */ false;
#undef KEYWORD_GROUP_CHECK
#undef KEYWORD_CHECK
    }

    // Looks up the one-byte character sequence `input` of length
    // `input_length` (which must be at least 1) via PerfectKeywordHash::GetToken
    // and returns the token that lookup yields.
    V8_INLINE Token::Value KeywordOrIdentifierToken(const uint8_t* input,
        int input_length)
    {
        DCHECK_GE(input_length, 1);
        // The hash works on plain chars; the bytes themselves are unchanged.
        const char* const chars = reinterpret_cast<const char*>(input);
        return PerfectKeywordHash::GetToken(chars, input_length);
    }

    // Compile-time membership test: returns true if `c` occurs anywhere in the
    // character array `s` at index `i` or later. The search covers all N array
    // elements, so for a string literal the trailing '\0' is searched as well.
    // Written as a single recursive return expression so it stays usable as a
    // constexpr function.
    template <int N>
    constexpr bool IsInString(const char (&s)[N], char c, size_t i = 0)
    {
        return i < static_cast<size_t>(N)
            && (s[i] == c || IsInString(s, c, i + 1));
    }

    inline constexpr bool CanBeKeywordCharacter(char c)
    {
        return IsInString(
#define KEYWORD_GROUP_CASE(ch) // Nothing
#define KEYWORD(keyword, token) keyword
            // Use C string literal concatenation ("a" "b" becomes "ab") to build one
            // giant string containing all the keywords.
            KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
#undef KEYWORD
#undef KEYWORD_GROUP_CASE
                ,
            c);
    }

    // Make sure tokens are stored as a single byte, so that a table with one
    // Token::Value per ASCII character (such as one_char_tokens below) occupies
    // one byte per entry.
    STATIC_ASSERT(sizeof(Token::Value) == 1);

    // Get the shortest token that this character starts, the token may change
    // depending on subsequent characters.
    //
    // Characters that do not start any token map to Token::ILLEGAL. The result
    // is computed with a single return expression so that the function can be
    // evaluated at compile time to build the one_char_tokens table.
    constexpr Token::Value GetOneCharToken(char c)
    {
        // The table below is laid out by hand, one mapping per line; keep
        // clang-format away from it.
        // clang-format off
        return
            c == '(' ? Token::LPAREN :
            c == ')' ? Token::RPAREN :
            c == '{' ? Token::LBRACE :
            c == '}' ? Token::RBRACE :
            c == '[' ? Token::LBRACK :
            c == ']' ? Token::RBRACK :
            c == '?' ? Token::CONDITIONAL :
            c == ':' ? Token::COLON :
            c == ';' ? Token::SEMICOLON :
            c == ',' ? Token::COMMA :
            c == '.' ? Token::PERIOD :
            c == '|' ? Token::BIT_OR :
            c == '&' ? Token::BIT_AND :
            c == '^' ? Token::BIT_XOR :
            c == '~' ? Token::BIT_NOT :
            c == '!' ? Token::NOT :
            c == '<' ? Token::LT :
            c == '>' ? Token::GT :
            c == '%' ? Token::MOD :
            c == '=' ? Token::ASSIGN :
            c == '+' ? Token::ADD :
            c == '-' ? Token::SUB :
            c == '*' ? Token::MUL :
            c == '/' ? Token::DIV :
            c == '#' ? Token::PRIVATE_NAME :
            // Both quote characters start a string literal.
            (c == '"' || c == '\'') ? Token::STRING :
            c == '`' ? Token::TEMPLATE_SPAN :
            // A backslash starts an identifier.
            c == '\\' ? Token::IDENTIFIER :
            // Whitespace or line terminator
            (c == ' ' || c == '\t' || c == '\v' || c == '\f' || c == '\r' || c == '\n')
                ? Token::WHITESPACE :
            // IsDecimalDigit must be tested before IsAsciiIdentifier
            IsDecimalDigit(c) ? Token::NUMBER :
            IsAsciiIdentifier(c) ? Token::IDENTIFIER :
            Token::ILLEGAL;
        // clang-format on
    }

    // Table of one-character tokens, by character (0x00..0x7F only). Entry N
    // holds GetOneCharToken(N), evaluated at compile time.
    static constexpr const Token::Value one_char_tokens[128] = {
#define ONE_CHAR_TOKEN_ENTRY(N) GetOneCharToken(N),
        INT_0_TO_127_LIST(ONE_CHAR_TOKEN_ENTRY)
#undef ONE_CHAR_TOKEN_ENTRY
    };

// The helpers above have already expanded the keyword list; remove the macro
// so that it does not stay defined for the rest of this header.
#undef KEYWORDS

    // Starts the literal buffer of the token being scanned and then scans an
    // identifier or keyword into it.
    V8_INLINE Token::Value Scanner::ScanIdentifierOrKeyword()
    {
        auto& literal = next().literal_chars;
        literal.Start();
        return ScanIdentifierOrKeywordInner();
    }

    // Per-character bit flags used by the fast path that scans keyword and
    // identifier tokens. Each enumerator occupies its own bit so that the flags
    // of several characters can be ORed together.
    enum class ScanFlags : uint8_t {
        // Bit 0.
        kTerminatesLiteral = 0x01,
        // Bit 1. Phrased as "cannot" rather than "can" so that ORing the flags
        // of multiple characters accumulates the property.
        kCannotBeKeyword = 0x02,
        // Bit 2.
        kCannotBeKeywordStart = 0x04,
        // Bit 3.
        kStringTerminator = 0x08,
        // Bit 4.
        kIdentifierNeedsSlowPath = 0x10,
        // Bit 5.
        kMultilineCommentCharacterNeedsSlowPath = 0x20,
    };
    // Returns the raw bit of `flag` when `condition` holds, and no bits
    // otherwise.
    constexpr uint8_t ScanFlagIf(bool condition, ScanFlags flag)
    {
        return condition ? static_cast<uint8_t>(flag) : uint8_t { 0 };
    }

    // Computes the ScanFlags bit set for character `c`. Evaluated at compile
    // time to fill the character_scan_flags table.
    constexpr uint8_t GetScanFlags(char c)
    {
        return
            // Keywords are all lowercase and only contain letters. Characters
            // that are not identifier characters deliberately do not get this
            // flag, so that it plays well with kTerminatesLiteral.
            ScanFlagIf(IsAsciiIdentifier(c) && !CanBeKeywordCharacter(c),
                ScanFlags::kCannotBeKeyword)
            | ScanFlagIf(!IsKeywordStart(c), ScanFlags::kCannotBeKeywordStart)
            // Anything that is not an identifier character terminates the
            // literal, or at least ends the literal fast path (like an
            // escape).
            | ScanFlagIf(!IsAsciiIdentifier(c), ScanFlags::kTerminatesLiteral)
            // Possible string termination characters.
            | ScanFlagIf(c == '\'' || c == '"' || c == '\n' || c == '\r' || c == '\\',
                ScanFlags::kStringTerminator)
            // Escapes are processed on the slow path.
            | ScanFlagIf(c == '\\', ScanFlags::kIdentifierNeedsSlowPath)
            // Newlines and * are the interesting characters when scanning a
            // multiline comment.
            | ScanFlagIf(c == '\n' || c == '\r' || c == '*',
                ScanFlags::kMultilineCommentCharacterNeedsSlowPath);
    }
    // True if the kTerminatesLiteral bit is set in `scan_flags`.
    inline bool TerminatesLiteral(uint8_t scan_flags)
    {
        const uint8_t mask = static_cast<uint8_t>(ScanFlags::kTerminatesLiteral);
        return (scan_flags & mask) != 0;
    }
    // True if the kCannotBeKeyword bit is NOT set in `scan_flags`.
    inline bool CanBeKeyword(uint8_t scan_flags)
    {
        const uint8_t mask = static_cast<uint8_t>(ScanFlags::kCannotBeKeyword);
        return (scan_flags & mask) == 0;
    }
    // True if the kIdentifierNeedsSlowPath bit is set in `scan_flags`.
    inline bool IdentifierNeedsSlowPath(uint8_t scan_flags)
    {
        const uint8_t mask = static_cast<uint8_t>(ScanFlags::kIdentifierNeedsSlowPath);
        return (scan_flags & mask) != 0;
    }
    // True if the kMultilineCommentCharacterNeedsSlowPath bit is set in
    // `scan_flags`.
    inline bool MultilineCommentCharacterNeedsSlowPath(uint8_t scan_flags)
    {
        const uint8_t mask = static_cast<uint8_t>(
            ScanFlags::kMultilineCommentCharacterNeedsSlowPath);
        return (scan_flags & mask) != 0;
    }
    // True if the kStringTerminator bit is set in `scan_flags`.
    inline bool MayTerminateString(uint8_t scan_flags)
    {
        const uint8_t mask = static_cast<uint8_t>(ScanFlags::kStringTerminator);
        return (scan_flags & mask) != 0;
    }
    // Table of precomputed scan flags for the 128 ASCII characters, for branchless
    // flag calculation during the scan.
    static constexpr const uint8_t character_scan_flags[128] = {
#define CALL_GET_SCAN_FLAGS(N) GetScanFlags(N),
        INT_0_TO_127_LIST(CALL_GET_SCAN_FLAGS)
#undef CALL_GET_SCAN_FLAGS
    };

    // Returns true if `c` lies within the character_scan_flags table and its
    // flags do not rule it out as a keyword character. Anything outside the
    // table (including negative values, thanks to the unsigned cast) yields
    // false.
    inline bool CharCanBeKeyword(uc32 c)
    {
        if (static_cast<uint32_t>(c) >= arraysize(character_scan_flags)) {
            return false;
        }
        return CanBeKeyword(character_scan_flags[c]);
    }

    // Scans an identifier or keyword whose first character is in c0_ and
    // returns its token.
    //
    // Fast path: while every character is ASCII and is not a backslash escape,
    // characters are appended to the literal and their precomputed
    // character_scan_flags are ORed together. When that finishes without
    // requesting the slow path, the result is either Token::IDENTIFIER directly
    // (some character ruled out a keyword) or whatever
    // KeywordOrIdentifierToken() returns for the collected one-byte literal.
    //
    // Everything else (a non-ASCII character, an escape, or a non-ASCII first
    // character) is handed to ScanIdentifierOrKeywordInnerSlow() together with
    // what is known so far: whether an escape was seen and whether the
    // identifier can still be a keyword. An identifier that starts with an
    // escape yielding '\\' or a non-identifier-start character gives
    // Token::ILLEGAL.
    V8_INLINE Token::Value Scanner::ScanIdentifierOrKeywordInner()
    {
        DCHECK(IsIdentifierStart(c0_));
        bool escaped = false;
        bool can_be_keyword = true;

        // The table is indexed directly by character, so it must cover exactly
        // the range accepted by the ASCII check below.
        STATIC_ASSERT(arraysize(character_scan_flags) == kMaxAscii + 1);
        if (V8_LIKELY(static_cast<uint32_t>(c0_) <= kMaxAscii)) {
            if (V8_LIKELY(c0_ != '\\')) {
                uint8_t scan_flags = character_scan_flags[c0_];
                DCHECK(!TerminatesLiteral(scan_flags));
                // For the first character the relevant question is "can this
                // start a keyword?". kCannotBeKeywordStart sits exactly one bit
                // above kCannotBeKeyword, so shifting right by one moves that
                // answer into the kCannotBeKeyword position that is tested
                // after the loop.
                STATIC_ASSERT(static_cast<uint8_t>(ScanFlags::kCannotBeKeywordStart) == static_cast<uint8_t>(ScanFlags::kCannotBeKeyword) << 1);
                scan_flags >>= 1;
                // Make sure the shifting above doesn't set IdentifierNeedsSlowPath.
                // Otherwise we'll fall into the slow path after scanning the identifier.
                DCHECK(!IdentifierNeedsSlowPath(scan_flags));
                AddLiteralChar(static_cast<char>(c0_));
                // The callback returns true to stop advancing and false to
                // consume the character and keep going.
                AdvanceUntil([this, &scan_flags](uc32 c0) {
                    if (V8_UNLIKELY(static_cast<uint32_t>(c0) > kMaxAscii)) {
                        // A non-ascii character means we need to drop through to the slow
                        // path.
                        // TODO(leszeks): This would be most efficient as a goto to the slow
                        // path, check codegen and maybe use a bool instead.
                        scan_flags |= static_cast<uint8_t>(ScanFlags::kIdentifierNeedsSlowPath);
                        return true;
                    }
                    uint8_t char_flags = character_scan_flags[c0];
                    // Accumulate the flags of the terminating character too; a
                    // backslash carries kIdentifierNeedsSlowPath.
                    scan_flags |= char_flags;
                    if (TerminatesLiteral(char_flags)) {
                        return true;
                    } else {
                        AddLiteralChar(static_cast<char>(c0));
                        return false;
                    }
                });

                if (V8_LIKELY(!IdentifierNeedsSlowPath(scan_flags))) {
                    // Some character cannot occur in a keyword (or the first
                    // one cannot start a keyword): no lookup needed.
                    if (!CanBeKeyword(scan_flags))
                        return Token::IDENTIFIER;
                    // Could be a keyword or identifier.
                    Vector<const uint8_t> chars = next().literal_chars.one_byte_literal();
                    return KeywordOrIdentifierToken(chars.start(), chars.length());
                }

                // Slow path needed: carry over what the fast path learned.
                can_be_keyword = CanBeKeyword(scan_flags);
            } else {
                // Special case for escapes at the start of an identifier.
                escaped = true;
                uc32 c = ScanIdentifierUnicodeEscape();
                // NOTE(review): presumably -1 is what a failed escape scan
                // yields, so that the check below also rejects it - confirm
                // against ScanIdentifierUnicodeEscape.
                DCHECK(!IsIdentifierStart(-1));
                if (c == '\\' || !IsIdentifierStart(c)) {
                    return Token::ILLEGAL;
                }
                AddLiteralChar(c);
                can_be_keyword = CharCanBeKeyword(c);
            }
        }

        return ScanIdentifierOrKeywordInnerSlow(escaped, can_be_keyword);
    }

    // Consumes a run of whitespace and line terminator characters starting at
    // c0_. Sets next().after_line_terminator as soon as a line terminator is
    // seen. Returns Token::WHITESPACE if at least one character was consumed
    // and Token::ILLEGAL if nothing was skipped.
    V8_INLINE Token::Value Scanner::SkipWhiteSpace()
    {
        const int initial_position = source_pos();

        // End of input is not whitespace, so the loop cannot run past it.
        DCHECK(!IsWhiteSpaceOrLineTerminator(kEndOfInput));

        for (; IsWhiteSpaceOrLineTerminator(c0_); Advance()) {
            // Once the flag is set there is nothing more to record.
            if (next().after_line_terminator) {
                continue;
            }
            if (unibrow::IsLineTerminator(c0_)) {
                next().after_line_terminator = true;
            }
        }

        // Something was skipped if the position moved.
        if (source_pos() != initial_position) {
            return Token::WHITESPACE;
        }

        DCHECK_NE('0', c0_);
        return Token::ILLEGAL;
    }

    // Scans one token starting at c0_ and returns it.
    //
    // Each loop iteration records the current source position as the start of
    // the token (next().location.beg_pos). For ASCII input, the precomputed
    // one_char_tokens table selects the case to run; multi-character
    // punctuators are then resolved by looking at the following character(s).
    // Whitespace and the various comment forms are skipped by helper functions
    // whose result is stored in `token`; the loop repeats for as long as that
    // result is Token::WHITESPACE, and any other result is returned as is.
    // Non-ASCII input is treated as an identifier start, end of input, or
    // whitespace, in that order.
    V8_INLINE Token::Value Scanner::ScanSingleToken()
    {
        Token::Value token;
        do {
            next().location.beg_pos = source_pos();

            if (V8_LIKELY(static_cast<unsigned>(c0_) <= kMaxAscii)) {
                token = one_char_tokens[c0_];

                // Note: `continue` inside this switch jumps to the do-while
                // condition, i.e. scanning restarts only if `token` is
                // Token::WHITESPACE.
                switch (token) {
                case Token::LPAREN:
                case Token::RPAREN:
                case Token::LBRACE:
                case Token::RBRACE:
                case Token::LBRACK:
                case Token::RBRACK:
                case Token::CONDITIONAL:
                case Token::COLON:
                case Token::SEMICOLON:
                case Token::COMMA:
                case Token::BIT_NOT:
                case Token::ILLEGAL:
                    // One character tokens.
                    return Select(token);

                case Token::STRING:
                    return ScanString();

                case Token::LT:
                    // < <= << <<= <!--
                    Advance();
                    if (c0_ == '=')
                        return Select(Token::LTE);
                    if (c0_ == '<')
                        return Select('=', Token::ASSIGN_SHL, Token::SHL);
                    if (c0_ == '!') {
                        token = ScanHtmlComment();
                        continue;
                    }
                    return Token::LT;

                case Token::GT:
                    // > >= >> >>= >>> >>>=
                    Advance();
                    if (c0_ == '=')
                        return Select(Token::GTE);
                    if (c0_ == '>') {
                        // >> >>= >>> >>>=
                        Advance();
                        if (c0_ == '=')
                            return Select(Token::ASSIGN_SAR);
                        if (c0_ == '>')
                            return Select('=', Token::ASSIGN_SHR, Token::SHR);
                        return Token::SAR;
                    }
                    return Token::GT;

                case Token::ASSIGN:
                    // = == === =>
                    Advance();
                    if (c0_ == '=')
                        return Select('=', Token::EQ_STRICT, Token::EQ);
                    if (c0_ == '>')
                        return Select(Token::ARROW);
                    return Token::ASSIGN;

                case Token::NOT:
                    // ! != !==
                    Advance();
                    if (c0_ == '=')
                        return Select('=', Token::NE_STRICT, Token::NE);
                    return Token::NOT;

                case Token::ADD:
                    // + ++ +=
                    Advance();
                    if (c0_ == '+')
                        return Select(Token::INC);
                    if (c0_ == '=')
                        return Select(Token::ASSIGN_ADD);
                    return Token::ADD;

                case Token::SUB:
                    // - -- --> -=
                    Advance();
                    if (c0_ == '-') {
                        Advance();
                        if (c0_ == '>' && next().after_line_terminator) {
                            // For compatibility with SpiderMonkey, we skip lines that
                            // start with an HTML comment end '-->'.
                            token = SkipSingleHTMLComment();
                            continue;
                        }
                        return Token::DEC;
                    }
                    if (c0_ == '=')
                        return Select(Token::ASSIGN_SUB);
                    return Token::SUB;

                case Token::MUL:
                    // * ** **= *=
                    Advance();
                    if (c0_ == '*')
                        return Select('=', Token::ASSIGN_EXP, Token::EXP);
                    if (c0_ == '=')
                        return Select(Token::ASSIGN_MUL);
                    return Token::MUL;

                case Token::MOD:
                    // % %=
                    return Select('=', Token::ASSIGN_MOD, Token::MOD);

                case Token::DIV:
                    // /  // /* /=
                    Advance();
                    if (c0_ == '/') {
                        // "//#" and "//@" introduce a source URL comment; the
                        // second '/' and the marker character are consumed
                        // before the comment body is handed off.
                        uc32 c = Peek();
                        if (c == '#' || c == '@') {
                            Advance();
                            Advance();
                            token = SkipSourceURLComment();
                            continue;
                        }
                        token = SkipSingleLineComment();
                        continue;
                    }
                    if (c0_ == '*') {
                        token = SkipMultiLineComment();
                        continue;
                    }
                    if (c0_ == '=')
                        return Select(Token::ASSIGN_DIV);
                    return Token::DIV;

                case Token::BIT_AND:
                    // & && &=
                    Advance();
                    if (c0_ == '&')
                        return Select(Token::AND);
                    if (c0_ == '=')
                        return Select(Token::ASSIGN_BIT_AND);
                    return Token::BIT_AND;

                case Token::BIT_OR:
                    // | || |=
                    Advance();
                    if (c0_ == '|')
                        return Select(Token::OR);
                    if (c0_ == '=')
                        return Select(Token::ASSIGN_BIT_OR);
                    return Token::BIT_OR;

                case Token::BIT_XOR:
                    // ^ ^=
                    return Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);

                case Token::PERIOD:
                    // . ... Number
                    Advance();
                    if (IsDecimalDigit(c0_))
                        return ScanNumber(true);
                    if (c0_ == '.') {
                        // Only three consecutive periods form an ellipsis; with
                        // just two, the first is returned as a plain period and
                        // the second is left unconsumed.
                        if (Peek() == '.') {
                            Advance();
                            Advance();
                            return Token::ELLIPSIS;
                        }
                    }
                    return Token::PERIOD;

                case Token::TEMPLATE_SPAN:
                    Advance();
                    return ScanTemplateSpan();

                case Token::PRIVATE_NAME:
                    return ScanPrivateName();

                case Token::WHITESPACE:
                    token = SkipWhiteSpace();
                    continue;

                case Token::NUMBER:
                    return ScanNumber(false);

                case Token::IDENTIFIER:
                    return ScanIdentifierOrKeyword();

                default:
                    UNREACHABLE();
                }
            }

            // Non-ASCII character (every ASCII case above returns or
            // continues). CombineSurrogatePair() is only attempted when c0_ is
            // not an identifier start by itself.
            if (IsIdentifierStart(c0_) || (CombineSurrogatePair() && IsIdentifierStart(c0_))) {
                return ScanIdentifierOrKeyword();
            }
            if (c0_ == kEndOfInput) {
                return source_->has_parser_error() ? Token::ILLEGAL : Token::EOS;
            }
            // SkipWhiteSpace() returns Token::ILLEGAL when nothing was skipped,
            // which ends the loop and becomes the result.
            token = SkipWhiteSpace();

            // Continue scanning for tokens as long as we're just skipping whitespace.
        } while (token == Token::WHITESPACE);

        return token;
    }

    // Scans the next token into `next_desc`, which must be the descriptor
    // returned by next(): stores the scanned token and then the source
    // position reached after scanning as the token's end position. Debug
    // builds additionally sanity-check the current, next and next-next
    // descriptors.
    void Scanner::Scan(TokenDesc* next_desc)
    {
        DCHECK_EQ(next_desc, &next());

        const Token::Value scanned_token = ScanSingleToken();
        next_desc->token = scanned_token;
        // A recorded parser error must surface as an ILLEGAL token.
        DCHECK_IMPLIES(has_parser_error(), next_desc->token == Token::ILLEGAL);
        next_desc->location.end_pos = source_pos();

#ifdef DEBUG
        SanityCheckTokenDesc(current());
        SanityCheckTokenDesc(next());
        SanityCheckTokenDesc(next_next());
#endif
    }

    // Convenience overload: scans the next token into next_.
    void Scanner::Scan()
    {
        Scan(next_);
    }

} // namespace internal
} // namespace v8

#endif // V8_PARSING_SCANNER_INL_H_
